Load data

infn = "../data/Int Hypox Invitro and Human Meta Data.csv"
metadata = read.table(infn, sep=",", header=TRUE, stringsAsFactors=FALSE)

Here we make the coding for the variables easier to work with by making them smaller and removing spaces and other problem-causing characters. We will also make an ID that has the important metadata in it, to make plots easier to interpet.

date2month = function(str) {
  month = months(as.Date(str, format="%m.%d.%y"))
  return(substring(month, 1, 3))
}

metadata$month = as.factor(unlist(lapply(metadata$Experiment.Date, date2month)))
metadata$line = as.factor(gsub(" ", "", metadata$Cell.Line))
metadata$spiked = as.factor(ifelse(grepl("No", metadata$Spike.In ), "nospike",
                                   "spike"))
metadata$hypoxia = as.factor(ifelse(grepl("Intermittent", metadata$Condition), "IH",
                             ifelse(grepl("Sustained", metadata$Condition), "SH",
                                    "NO")))
metadata$hypoxia = relevel(metadata$hypoxia, ref="NO")
metadata$filename = metadata$RCC.FILE.NAME
metadata$fullname = metadata$Full.Sample.Name
metadata$column = ifelse(grepl("Oligo", metadata$Column), "oligo",
                  ifelse(grepl("Reg", metadata$Column), "reg", "nocol"))
metadata$replicate = unlist(lapply(metadata$Replicate.Number,
                                   function(x) strsplit(as.character(x), "-")[[1]][2]))
metadata$id = as.character(paste(metadata$line, metadata$hypoxia, metadata$spiked,
                                 metadata$column, metadata$month, sep="_"))
metadata$id = ifelse(grepl("Blank", metadata$id), metadata$Full.Sample.Name,
                     metadata$id)
metadata$linehypo = paste(metadata$line, metadata$hypoxia, sep="_")

Now we are ready to roll and can read in the RCC files using the sweet NanoStringQCPro library. We’ll just read it in, extract the counts and then actually do the analysis with DESeq2.

library(NanoStringQCPro)
## 
rccFiles = paste("../data/rcc/", metadata[,"filename"], sep="")
#blanks = c("20160602_ES052616miRNAV3aC2_Blank_11.RCC", "20160602_ES052616miRNAV3aC2_Blank_12.RCC")
eset = newRccSet(rccFiles=rccFiles, blankLabel="Blank")
## Reading RCC files...
## checkRccSet() messages:
##   Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
##   The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...): 
##   Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
pdata = pData(eset)
pdata$SampleType = metadata$line
pData(eset) <- pdata
proc = preprocRccSet(eset)
counts = as.data.frame(exprs(eset))
colnames(counts) = metadata$id
is_positive = function(column) {
  return(grepl("Pos", column))
}
is_negative = function(column) {
  return(grepl("Neg", column))
}
is_spikein = function(column) {
  return(grepl("Spike", column))
}
is_ligation = function(column) {
  return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
  return(grepl("Housekee", column))
}
is_prior = function(column) {
  return(grepl("miR-210", column) | grepl("miR-365", column))
}

Scale miRNA with known crosstalk

A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.

correct_miRNA = function(eset) {
  require(dplyr)
  fdat = fData(eset)
  fdat = fdat %>%
    tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
                    fill="right", remove=FALSE) %>%
    dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
  sf = as.numeric(fdat$scalefactor)
  posa = as.numeric(counts["Positive_POS_A_ERCC_00117.1",])
  scales = t(replicate(length(sf), posa))
  scales = scales * sf
  scaled = counts - scales
  scaled[scaled<0] = 0
  rownames(scaled) = fdat$Name
  return(round(scaled))
}

plot_corrected_miRNA = function(scaled, counts) {
  require(reshape)
  require(ggplot2)
  rownames(counts) = rownames(scaled)
  is_scaled = rowSums(abs(counts - scaled)) > 0
  counts = melt(as.matrix(counts[is_scaled,]))
  colnames(counts) = c("gene", "sample", "value")
  counts$scaled = "no"
  scaled = melt(as.matrix(scaled[is_scaled,]))
  colnames(scaled) = c("gene", "sample", "value")
  scaled$scaled = "yes"
  all = rbind(counts, scaled)
  library(cowplot)
  ggplot(all, aes(sample, value, color=scaled)) + facet_wrap(~gene) +
    geom_point(size=0.5) +
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
    guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
    ggtitle("miRNA hybridization crosstalk correction")
}
scaled = correct_miRNA(eset)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
plot_corrected_miRNA(scaled, counts)
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## Loading required package: ggplot2
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave

Imaging QC

This percentage should be super high, close to 100%. If there isn’t that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. Nanostring folks call < 75% a failure.

library(cowplot)
pdat = pData(eset) %>% left_join(metadata, by=c("FileName"="RCC.FILE.NAME"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(id, pcounted)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
  ylab("percentage of FOV counted") + xlab("sample") +
  geom_hline(yintercept=75, color="red")

Binding density

Nanostring folks call a sample problematic if the binding density is < 0.05 or > 2.25.

ggplot(pdat, aes(id, BindingDensity)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
  ylab("Binding density") + xlab("sample") +
  geom_hline(yintercept=0.05, color="red") +
  geom_hline(yintercept=2.25, color="red")

Total counts vs miRNA detected

Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Huh7 cells look like they have a lower number of genes detected on average for a given number of reads. This might be a technical artifact or it could also be that there are less miRNA in the Huh7 cells.

endocounts = counts[grepl("Endo", rownames(counts)),]
cdf = data.frame(total=colSums(counts), detected=colSums(counts > 30))
rownames(cdf) = colnames(counts)
cdf$id = rownames(cdf)
cdf = cdf %>% left_join(metadata, by="id")
ggplot(cdf, aes(total, detected, color=column, label=linehypo)) +
  geom_text() + scale_x_log10()

Positive controls

Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.

pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
                  GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
                                          function(x) cor(x, pcdf$concentration)),
                        sample=colnames(pccounts)) %>%
  left_join(metadata, by=c("sample"="RCC.FILE.NAME"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
  ylab("positive control correlation") +
  xlab("positive control correlation")

Ligation controls

Here we look at R^2 for the ligation controls as well. It looks like a couple samples failed.

pcdf = data.frame(concentration=log2(c(128, 32, 8)),
                  GeneName=paste("POS_", c("A", "B", "C"), sep="_"))
pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
                                          function(x) cor(x, pcdf$concentration)),
                        sample=colnames(pccounts)) %>%
  left_join(metadata, by=c("sample"="RCC.FILE.NAME"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
  ylab("ligation control correlation") +
  xlab("ligation control correlation")

subset(corsamples, correlation < 0.75)$sample
## [1] "20160602_ES052616miRNAV3aC2_Blank_11.RCC"
## [2] "20160602_ES052616miRNAV3aC2_Blank_12.RCC"

Ah, of course, the blanks.

library(ggplot2)
library(dplyr)
library(cowplot)

extract_pred = function(counts, predicate) {
  toplot = counts[predicate(rownames(counts)),] %>%
    tibble::rownames_to_column() %>%
    tidyr::gather("sample", "count", -rowname)
  colnames(toplot) = c("spot", "sample", "count")
  toplot = toplot %>% left_join(metadata, by=c("sample"="id"))
  return(toplot)
}
spotbarplot = function(toplot) {
  ggplot(toplot,
        aes(sample, count)) + geom_bar(stat='identity') +
    facet_wrap(~spot) +
    theme(axis.text.x = element_blank(),
          text = element_text(size=8))
}
spotboxplot = function(toplot) {
  ggplot(toplot,
        aes(linehypo, count)) + geom_boxplot() +
    facet_wrap(~spot) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
spotbarplot(extract_pred(counts, is_positive))

Negative controls

Negative counts can be relatively high in some of the samples compared to the other samples.

spotbarplot(extract_pred(counts, is_negative))

knitr::kable(subset(extract_pred(counts, is_negative), count > 50))
spot sample count RCC.FILE.NAME Full.Sample.Name Cell.Line Condition What Column Spike.In Experiment.Date Replicate.Number Date.Shipped RCC.File.Date month line spiked hypoxia filename fullname column replicate linehypo
1 Negative_NEG_C_ERCC_00019.1 Huh7_IH_nospike_nocol_Jan 64 20150421_PHS 1_01_10.RCC IH1 Huh 7 Intermittent Hypoxia Huh 7 cells Int Hypox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike IH 20150421_PHS 1_01_10.RCC IH1 nocol 1 Huh7_IH
3 Negative_NEG_E_ERCC_00098.1 Huh7_IH_nospike_nocol_Jan 79 20150421_PHS 1_01_10.RCC IH1 Huh 7 Intermittent Hypoxia Huh 7 cells Int Hypox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike IH 20150421_PHS 1_01_10.RCC IH1 nocol 1 Huh7_IH
4 Negative_NEG_A_ERCC_00096.1 Huh7_IH_nospike_nocol_Jan 52 20150421_PHS 1_01_10.RCC IH1 Huh 7 Intermittent Hypoxia Huh 7 cells Int Hypox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike IH 20150421_PHS 1_01_10.RCC IH1 nocol 1 Huh7_IH
9 Negative_NEG_C_ERCC_00019.1 Huh7_NO_nospike_nocol_Jan 95 20150421_PHS 1_01_11.RCC N1 Huh 7 Normoxia Huh7 Normox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike NO 20150421_PHS 1_01_11.RCC N1 nocol 1 Huh7_NO
11 Negative_NEG_E_ERCC_00098.1 Huh7_NO_nospike_nocol_Jan 87 20150421_PHS 1_01_11.RCC N1 Huh 7 Normoxia Huh7 Normox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike NO 20150421_PHS 1_01_11.RCC N1 nocol 1 Huh7_NO
17 Negative_NEG_C_ERCC_00019.1 Huh7_SH_nospike_nocol_Jan 86 20150421_PHS 1_01_12.RCC SH1 Huh 7 Sustained Hypoxia Huh7 Sust Hypox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike SH 20150421_PHS 1_01_12.RCC SH1 nocol 1 Huh7_SH
19 Negative_NEG_E_ERCC_00098.1 Huh7_SH_nospike_nocol_Jan 82 20150421_PHS 1_01_12.RCC SH1 Huh 7 Sustained Hypoxia Huh7 Sust Hypox None No spike in 1.8.15 H-1 4.6.15 4.21.2015 Jan Huh7 nospike SH 20150421_PHS 1_01_12.RCC SH1 nocol 1 Huh7_SH

Was anything different about those samples?

SpikeIn

It looks like there are only some samples that have spike-ins.

spotbarplot(extract_pred(counts, is_spikein))

And those correspond to the spike-ins in the metadata:

knitr::kable(subset(extract_pred(counts, is_spikein), count > 500))
spot sample count RCC.FILE.NAME Full.Sample.Name Cell.Line Condition What Column Spike.In Experiment.Date Replicate.Number Date.Shipped RCC.File.Date month line spiked hypoxia filename fullname column replicate linehypo
18 SpikeIn_cel-miR-254|0_MIMAT0000310 U937_NO_spike_nocol_Oct 4287 20151118_Espy-11-18-15-rescan_01_07.RCC UN U937 Normoxia U937 Normox None + Spike In 10.27.15 U-1 10.28.15 11.15.15 Oct U937 spike NO 20151118_Espy-11-18-15-rescan_01_07.RCC UN nocol 1 U937_NO
23 SpikeIn_cel-miR-254|0_MIMAT0000310 U937_IH_spike_nocol_Oct 5390 20151118_Espy-11-18-15-rescan_01_08.RCC UIH U937 Intermittent Hypoxia U937 Int Hypox None + Spike In 10.27.15 U-1 10.28.15 11.15.15 Oct U937 spike IH 20151118_Espy-11-18-15-rescan_01_08.RCC UIH nocol 1 U937_IH
28 SpikeIn_cel-miR-254|0_MIMAT0000310 U937_SH_spike_nocol_Oct 5733 20151118_Espy-11-18-15-rescan_01_09.RCC USH U937 Sustained Hypoxia U937 Sust Hypx None + Spike In 10.27.15 U-1 10.28.15 11.15.15 Oct U937 spike SH 20151118_Espy-11-18-15-rescan_01_09.RCC USH nocol 1 U937_SH
33 SpikeIn_cel-miR-254|0_MIMAT0000310 Huh7_NO_spike_nocol_Oct 6840 20151118_Espy-11-18-15-rescan_01_10.RCC H7N Huh 7 Normoxia Huh 7 Normox None + Spike In 10.27.15 H-2 10.28.15 11.15.15 Oct Huh7 spike NO 20151118_Espy-11-18-15-rescan_01_10.RCC H7N nocol 2 Huh7_NO
38 SpikeIn_cel-miR-254|0_MIMAT0000310 Huh7_IH_spike_nocol_Oct 5883 20151118_Espy-11-18-15-rescan_01_11.RCC H7IH Huh 7 Intermittent Hypoxia Huh 7 Int Hypox None + Spike In 10.27.15 H-2 10.28.15 11.15.15 Oct Huh7 spike IH 20151118_Espy-11-18-15-rescan_01_11.RCC H7IH nocol 2 Huh7_IH
43 SpikeIn_cel-miR-254|0_MIMAT0000310 Huh7_SH_spike_nocol_Oct 4708 20151118_Espy-11-18-15-rescan_01_12.RCC H7SH Huh 7 Sustained Hypoxia Huh 7 Sust hypox None + Spike In 10.27.15 H-2 10.28.15 11.15.15 Oct Huh7 spike SH 20151118_Espy-11-18-15-rescan_01_12.RCC H7SH nocol 2 Huh7_SH
48 SpikeIn_cel-miR-254|0_MIMAT0000310 U937_NO_spike_nocol_Jan 1501 20160114_ES011416miRNAV3a_U937 N_01.RCC UN U937 Normoxia U937 Normox None + Spike In 1.3.16 U-2 1.4.16 1.20.16 Jan U937 spike NO 20160114_ES011416miRNAV3a_U937 N_01.RCC UN nocol 2 U937_NO
53 SpikeIn_cel-miR-254|0_MIMAT0000310 U937_IH_spike_nocol_Jan 2333 20160114_ES011416miRNAV3a_U937 IH_02.RCC UIH U937 Intermittent Hypoxia U937 Int Hypox None + Spike In 1.3.16 U-2 1.4.16 1.20.16 Jan U937 spike IH 20160114_ES011416miRNAV3a_U937 IH_02.RCC UIH nocol 2 U937_IH
63 SpikeIn_cel-miR-254|0_MIMAT0000310 Huh7_NO_spike_oligo_Jan 3587 20160114_ES011416miRNAV3a_H7 Nor5m_04.RCC HN9 Huh 7 Normoxia Huh 7 Normox Oligo + Spike In 1.3.16 H-3 1.4.16 1.20.16 Jan Huh7 spike NO 20160114_ES011416miRNAV3a_H7 Nor5m_04.RCC HN9 oligo 3 Huh7_NO
73 SpikeIn_cel-miR-254|0_MIMAT0000310 Huh7_SH_spike_reg_Jan 1308 20160114_ES011416miRNAV3a_H7 SH_06.RCC HSHR Huh 7 Sustained Hypoxia Huh 7 Sust hypox Regular +Spike IN 1.3.16 H-3 1.4.16 1.20.16 Jan Huh7 spike SH 20160114_ES011416miRNAV3a_H7 SH_06.RCC HSHR reg 3 Huh7_SH

Ligation

It looks like some of the ligation controls worked better than the other. Kit suggested just normalizing by the mean of the posA ligation control. What we will do instead is do the normalization we usually do, but include the ligation efficiency for each sample as a factor in the regression. Then we can test whether or not our normalization scheme is correcting for the ligation efficiency. It should be, if the result of the ligation efficiency is overall less miRNA.

spotbarplot(extract_pred(counts, is_ligation))

Ligation efficiency

Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model.

lig_posa = as.numeric(counts[grepl("Ligation_LIG_POS_A", rownames(counts)),])
metadata$lignorm = log2(lig_posa / mean(lig_posa))
ggplot(subset(metadata, line != "Blank"), aes(id, lignorm)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  ylab("ligation factor") +
  xlab("sample")

Housekeeping

For a subset of the samples, one housekeeping gene is very high.

spotbarplot(extract_pred(counts, is_housekeeping))

These don’t look confined to any particular cell type. Why are some of these so different?

table(subset(extract_pred(counts, is_housekeeping), count > 1000)$sample)
## 
##   Huh7_IH_spike_nocol_Oct Huh7_NO_nospike_nocol_Dec 
##                         1                         1 
##   Huh7_NO_spike_nocol_Oct Huh7_SH_nospike_nocol_Jan 
##                         3                         1 
##   Huh7_SH_spike_nocol_Oct  THP_IH_nospike_nocol_Aug 
##                         1                         1 
##  THP_IH_nospike_nocol_Dec  THP_IH_nospike_nocol_Jul 
##                         1                         1 
##  THP_IH_nospike_nocol_Sep  THP_NO_nospike_nocol_Aug 
##                         1                         1 
##  THP_NO_nospike_nocol_Dec  THP_NO_nospike_nocol_Jul 
##                         1                         1 
##  THP_NO_nospike_nocol_Sep  THP_SH_nospike_nocol_Aug 
##                         2                         1 
##  THP_SH_nospike_nocol_Dec  THP_SH_nospike_nocol_Jul 
##                         1                         1 
##  THP_SH_nospike_nocol_Sep U937_IH_nospike_nocol_Aug 
##                         2                         1 
## U937_IH_nospike_nocol_Jul U937_IH_nospike_nocol_Nov 
##                         2                         1 
##   U937_IH_spike_nocol_Jan   U937_IH_spike_nocol_Oct 
##                         1                         1 
## U937_NO_nospike_nocol_Aug U937_NO_nospike_nocol_Jul 
##                         1                         2 
## U937_NO_nospike_nocol_Nov   U937_NO_spike_nocol_Jan 
##                         1                         1 
##   U937_NO_spike_nocol_Oct U937_SH_nospike_nocol_Aug 
##                         1                         1 
## U937_SH_nospike_nocol_Jul U937_SH_nospike_nocol_Nov 
##                         2                         1 
##   U937_SH_spike_nocol_Jan   U937_SH_spike_nocol_Oct 
##                         1                         2

Normalization

Drop non-negative control and housekeeping

drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
metadata = subset(metadata, id %in% colnames(keep))
counts = keep

Drop genes below noise floor

We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries.

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:reshape':
## 
##     expand, rename
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, regroup, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=~month+column+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))

Below we can see after normalizing by library size, if we set the cutoff to be 30, we are mostly above the noise floor measured by the negative controls:

ggplot(extract_pred(ncounts, is_negative), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We used the threshold of 30 as the noise floor and dropped all" miRNA that did not have at least 30 counts in 20% of the samples.

keepgenes = rowSums(ncounts > 30) > 0.2*ncol(ncounts)
counts = counts[keepgenes,]

Does normalization for ligation effiency buy us anything?

Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. ~month+column+line+hypoxia+lignorm. Then we fit a reduced model without the lignorm term to answer the question ‘is there miRNA that are affected by the ligation efficiency?’

Yes, it looks like even after normalization, if we fit a model with just the ligation efficiency that there are miRNA affected. So we should correct for that. Here you can see only a subset of the miRNA are affected; if we do it like this instead of just scaling the counts, we can correct for the specific miRNA that have counts that are correlated with the ligation efficiency.

dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=~month+column+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds = DESeq(dds, full=~month+column+line+hypoxia+lignorm,
            reduced=~month+column+line+hypoxia,  test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)

res = data.frame(res) %>% tibble::rownames_to_column() %>%
  arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
rowname baseMean log2FoldChange lfcSE stat pvalue padj
Endogenous1_hsa-miR-146a-5p|0_MIMAT0000449 657.35125 0.7237097 0.1508606 23.395360 0.0000013 0.0001715
Endogenous1_hsa-miR-302d-3p|0.006_MIMAT0000718 102.91760 -0.5390867 0.1240328 19.495139 0.0000101 0.0006556
Endogenous1_hsa-miR-548ar-5p|0.004_MIMAT0022265 39.47914 -0.5077435 0.1242403 16.389052 0.0000516 0.0022352
Endogenous1_hsa-miR-374b-5p|0_MIMAT0004955 72.55252 0.5847129 0.1523364 15.176599 0.0000979 0.0031821
Endogenous1_hsa-miR-26a-5p|0_MIMAT0000082 132.79106 0.4393709 0.1173975 14.519217 0.0001387 0.0036072
Endogenous1_hsa-miR-362-3p|0_MIMAT0004683 28.27474 0.4630149 0.1281661 13.423712 0.0002485 0.0053834
Endogenous1_hsa-let-7c-5p|0_MIMAT0000064 33.92337 -0.4690759 0.1360234 11.693007 0.0006274 0.0101945
Endogenous1_hsa-miR-378e|0.021_MIMAT0018927 129.44567 -0.4333283 0.1269200 11.724728 0.0006168 0.0101945
Endogenous1_hsa-miR-19b-3p|0_MIMAT0000074 224.28645 0.5561009 0.1606614 10.935508 0.0009434 0.0131173
Endogenous1_hsa-miR-28-3p|0_MIMAT0004502 31.40274 0.3815731 0.1184338 10.473747 0.0012108 0.0131173
Endogenous1_hsa-miR-106b-5p|0_MIMAT0000680 104.92074 0.4505765 0.1362040 10.563968 0.0011531 0.0131173
Endogenous1_hsa-miR-494-3p|0.099_MIMAT0002816 886.09046 0.3391129 0.1028740 10.540706 0.0011677 0.0131173
Endogenous1_hsa-miR-148b-3p|0_MIMAT0000759 53.42077 -0.3374903 0.1054467 10.135736 0.0014542 0.0145422
Endogenous1_hsa-miR-612|0_MIMAT0003280 36.37244 -0.3921924 0.1285868 9.484625 0.0020720 0.0192401
Endogenous1_hsa-miR-30e-5p|0_MIMAT0000692 136.68859 0.4774734 0.1570820 9.315863 0.0022718 0.0196888
Endogenous1_hsa-miR-29a-3p|0_MIMAT0000086 497.82160 0.3946110 0.1266899 9.089098 0.0025714 0.0208924
Endogenous1_hsa-miR-20a-5p+hsa-miR-20b-5p|0_MIMAT0000075 302.90494 0.5366625 0.1785337 7.922923 0.0048812 0.0373271

PCA

Here we do PCA to look at how the samples cluster. We can see a clear separation along the 1st and 2nd PCA based on line, but within a cell line, the different hypoxic states overlap.

dds = DESeqDataSetFromMatrix(counts, colData=metadata,
                             design=~column+spiked+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
  rv <- matrixStats::rowVars(assay(object))
  select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
      length(rv)))]
  pca <- prcomp(t(assay(object)[select, ]))
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  names(percentVar) = colnames(pca$x)
  pca$percentVar = percentVar
  return(pca)}
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="id"))
pca_plot = function(comps, nc1, nc2, colorby) {
   c1str = paste0("PC", nc1)
   c2str = paste0("PC", nc2)
  ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
    geom_point() + theme_bw() +
    xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
    ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))
  }
pca_plot(comps, 1, 2, "linehypo")

pca_plot(comps, 3, 4, "linehypo")

Differential expression

Nanostring data has an opposite mean-variance relationship than RNA-seq data. Here the dispersion is slightly higher for more highly expressed miRNA– the the trend is mostly flat, however.

dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)

Cell line differences

Here we ask, controlling for the month, the column and the hypoxic state of the cell line, are there differences between the cell lines. The cell lines are very different from each other, most of the miRNA are significantly different between the cell lines.

Huh7 vs U937

res = results(dds, contrast=c("line", "Huh7", "U937"))

Huh7 vs THP

res = results(dds, contrast=c("line", "Huh7", "THP"))

THP vs U937

res = results(dds, contrast=c("line", "THP", "U937"))

Hypoxic state differences

Here we are asking, controlling for the month, column and the cell line, are there differences due to hypoxic state?

Normal vs intermittent

res = results(dds, contrast=c("hypoxia", "NO", "IH"))

Intermittent vs sustained

res = results(dds, contrast=c("hypoxia", "IH", "SH"))

Normal vs sustained

res = results(dds, contrast=c("hypoxia", "NO", "SH"))

Cell line specific hypoxic state differences

Here we refit the model with a new factor that is a combination of cell line and hypoxic state. Then we test if there are differences between the hypoxic states within each cell line.

metadata$linehypo = paste(metadata$line, metadata$hypoxia, sep="_")
dds = DESeqDataSetFromMatrix(counts, colData=metadata,
                             design=~column+spiked+linehypo+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

THP

We can se that the overall fold changes are fairly small relative to the standard error of the fold changes. Signal to noise ratio here is combined as the log fold change divided by the log fold change standard error.

write_results = function(res, fname) {
  res = data.frame(res) %>% tibble::rownames_to_column() %>%
    arrange(pvalue)
  write.table(res, file=fname, quote=FALSE, col.names=TRUE,
              row.names=FALSE, sep=",")
}

res = results(dds, contrast=c("linehypo", "THP_NO", "THP_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("THP NO vs IH")

write_results(res, "THP_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "THP_NO", "THP_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("THP NO vs SH")

write_results(res, "THP_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "THP_IH", "THP_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("THP IH vs SH")

write_results(res, "THP_IHvsSH.csv")

Huh7

res = results(dds, contrast=c("linehypo", "Huh7_NO", "Huh7_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("Huh7 NO vs IH")

write_results(res, "Huh7_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "Huh7_NO", "Huh7_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("Huh7 NO vs SH")

write_results(res, "Huh7_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "Huh7_IH", "Huh7_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("Huh7 IH vs SH")

write_results(res, "Huh7_IHvsSH.csv")

U937

res = results(dds, contrast=c("linehypo", "U937_NO", "U937_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("U937 NO vs IH")

write_results(res, "U937_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "U937_NO", "U937_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("U937 NO vs SH")

write_results(res, "U937_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "U937_IH", "U937_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
  ylab("signal to noise ratio") + ggtitle("U937 IH vs SH")

write_results(res, "U937_IHvsSH.csv")

Prior miRNA

miR-210 is below the noise floor– one of the miR-365 spots though has higher expression values.

spotbarplot(extract_pred(counts, is_prior)) + scale_y_log10()

spotboxplot(extract_pred(counts, is_prior))